import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

# The notebook used the IPython magic `%matplotlib inline`, which is not valid
# Python syntax. Request the same backend through the IPython API when an
# IPython shell is present, and do nothing under a plain interpreter.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Read all datasets.
match_data = pd.read_csv('Match_data.csv', encoding='latin1')

# Player data: one CSV per statistics table.
player_time = pd.read_csv('player_playingtime1.csv')
player_def = pd.read_csv('player_defense1.csv')
player_keepers = pd.read_csv('player_keepers1.csv')
player_shooting = pd.read_csv('player_shooting1.csv')
player_stats = pd.read_csv('player_stats1.csv')
player_misc = pd.read_csv('player_misc1.csv')

# ---------------------------------------------------------------------------
# Data preprocessing
# ---------------------------------------------------------------------------
match_data.head(3)
match_data.shape

# Rename the numbered columns ('1' -> home side, '2' -> away side) to
# descriptive names. The keys are disjoint, so a single rename is enough.
match_data.rename(
    columns={
        '1': 'home_team',
        '2': 'away_team',
        '1_panelties_scored': 'homeg_panelties',
        '2_panelties_scored': 'awayg_panelties',
        '1_attempts': 'home_attempts',
        '1_goals': 'home_goals',
        '2_goals': 'away_goals',
        '1_conceded': 'home_conceded',
        '2_conceded': 'away_conceded',
        '1_yellow_cards': 'home_yellow_cards',
        '1_goal_prevented': 'home_goal_prevented',
        '2_goal_prevented': 'away_goal_prevented',
        '1_own_goal': 'home_owngoals',
        '2_own_goal': 'away_owngoals',
        '1_forced_turnovers': 'home_forced_turnovers',
        '2_forced_turnovers': 'away_forced_turnovers',
        '1_defensive_pressure_applied': 'home_pressure',
        '2_defensive_pressure_applied': 'away_pressure',
        '1_passes': 'home_passes',
        '2_passes': 'away_passes',
        '1_free_kicks': 'home_free_kicks',
        '2_free_kicks': 'away_free_kicks',
        '2_yellow_cards': 'away_yellow_cards',
        '1_red_cards': 'home_RedCards',
        '2_red_cards': 'away_RedCards',
        '1_xg': 'homeXG',
        '2_xg': 'awayXG',
        '1_goal_inside_penalty_area': 'homeG_inside',
        '2_goal_inside_penalty_area': 'awayG_inside',
        '1_goal_outside_penalty_area': 'homeG_outside',
        '2_goal_outside_penalty_area': 'awayG_outside',
        '1_poss': 'home_poss',
        '2_poss': 'away_poss',
        '1_offside': 'home_offside',
        '2_offside': 'away_offside',
        '1_corners': 'home_corners',
        '2_corners': 'away_corners',
        '2_attempts': 'away_attempts',
        '1_ontarget': 'home_ontarget',
        '2_ontarget': 'away_ontarget',
        '1_offtarget': 'home_offtarget',
        '2_offtarget': 'away_offtarget',
        'faul_against_1': 'home_fauls',
        'faul_against_2': 'away_fauls',
        '1_passes_completed': 'home_completed_passes',
        '2_passes_completed': 'away_completed_passes',
    },
    inplace=True,
)
match_data.columns
match_data.info()

# Parse the match date, and keep only the hour component of the kick-off time.
match_data['date'] = pd.to_datetime(match_data['date'])
match_data['hour'] = pd.to_datetime(match_data['hour']).dt.hour
match_data[['date', 'hour']].dtypes

player_time['age'].head(2)
player_time.dtypes
player_time.columns

# 'age' is a string containing a '-'. Keep the part before the dash in 'age'
# and park the remainder in a scratch column 'un'.
# NOTE(review): presumably the format is "years-days" - confirm against the CSVs.
for frame in (player_time, player_def, player_keepers,
              player_misc, player_stats, player_shooting):
    frame[['age', 'un']] = frame['age'].str.split(pat='-', expand=True)

# The scratch column is dropped everywhere except player_time, which keeps it.
for frame in (player_def, player_keepers, player_shooting,
              player_stats, player_misc):
    frame.drop(['un'], axis=1, inplace=True)

l1 = player_time.columns
l2 = player_shooting.columns
l3 = player_misc.columns
l5 = player_keepers.columns
l6 = player_stats.columns
print(len(l1), len(l2), len(l3), len(l5), len(l6))

# Cast each frame's OWN age column to int. Assigning player_time.age into the
# other frames aligns on the row index, which gives a frame with different rows
# (e.g. the goalkeeper table) ages that belong to unrelated players.
for frame in (player_def, player_keepers, player_misc,
              player_shooting, player_stats):
    frame['age'] = frame['age'].astype(int)

# ---------------------------------------------------------------------------
# Visualization
#   1. Histogram
#   2. Barplot
#   3. Piechart
#   4. TreeMap
#   5. Heatmap
#   6. ScatterPlot
# ---------------------------------------------------------------------------

# --- Histogram ---
# Used to show the distribution of numerical attributes.
fig = match_data.hist(figsize=(20, 20))

px.histogram(
    data_frame=player_shooting,
    x=player_shooting.goals,
    color='team',
    title='count of goals by teams',
)

# Select the column before aggregating so that only 'aerials_lost' is summed;
# summing every column is wasteful and can fail on non-numeric columns.
groups = player_misc.groupby(['team', 'player', 'position'])[['aerials_lost']].sum()
groups.reset_index(inplace=True)

# Plain boolean filter: keeps integer dtypes and does not discard rows because
# of a NaN in some unrelated column.
mo = groups[groups.team == 'Morocco']
px.histogram(data_frame=mo, x=mo.player, y=mo.aerials_lost, color=mo.position)

sns.histplot(data=match_data, x='home_goals')

# --- Barplot ---
# This bar plot shows every team with its players' positions; four main
# positions are found commonly in every team.
figure = px.bar(
    player_stats,
    x='team',
    color='position',
    barmode='group',
    title='Group Players by team and Postion',
)
figure.show()

# interactive
match_data['goals_total'] = match_data.home_goals + match_data.away_goals
Groups_total_goals = match_data.groupby(by='group')[['goals_total']].sum()
Groups_total_goals.reset_index(inplace=True)
Groups_total_goals

px.bar(
    data_frame=Groups_total_goals,
    x=Groups_total_goals.group,
    y=Groups_total_goals.goals_total,
    title='Total Goals Scored in each round',
)

# Possession of the semi-finalists, home side on the left, away on the right.
semi_final_data = match_data[match_data.group == 'Semi-Final']
sem = semi_final_data[['home_team', 'away_team', 'home_goals',
                       'away_goals', 'home_poss', 'away_poss']]
sem

z = make_subplots(rows=1, cols=2, horizontal_spacing=0.2)
z.add_trace(
    go.Bar(y=sem['home_team'], x=sem['home_poss'], orientation='h'),
    row=1, col=1,
)
z.add_trace(
    go.Bar(y=sem['away_team'], x=sem['away_poss'], orientation='h'),
    row=1, col=2,
)

# Possession for the first ten matches in the table.
passes = match_data[['home_team', 'away_team', 'home_poss', 'away_poss']]
passes = passes[:10]
i = make_subplots(rows=1, cols=2, row_heights=[10], column_widths=[10, 10],
                  horizontal_spacing=0.2)
i.add_trace(
    go.Bar(x=passes['home_poss'], y=passes['home_team'], orientation='h',
           name='Home_team', text=passes['home_poss']),
    row=1, col=1,
)
i.add_trace(
    go.Bar(x=passes['away_poss'], y=passes['away_team'], orientation='h',
           name='Away_team', text=passes['away_poss']),
    row=1, col=2,
)

# Goals and attempts per team when listed as the home side.
team_goals = (
    match_data.groupby('home_team')[['home_goals', 'home_attempts']]
    .sum()
    .reset_index()
)
team_goals_sorted = team_goals.sort_values('home_goals')

fig1 = make_subplots(rows=2, cols=2, vertical_spacing=0.5)
import plotly.graph_objects as go
fig1.add_trace(
    go.Bar(x=team_goals.home_team, y=team_goals.home_goals, name='un sorted'),
    row=1, col=1,
)
fig1.add_trace(
    go.Bar(x=team_goals_sorted.home_team, y=team_goals_sorted.home_goals,
           name='sorted'),
    row=1, col=2,
)
fig1.add_trace(
    go.Bar(x=mo.player, y=mo.aerials_lost, name='Number of losses aerlies'),
    row=2, col=1,
)
fig1.show()

# Red and yellow cards per home team, in long format for a grouped bar chart.
team1_cards = match_data.groupby(by=['home_team'])[['home_RedCards', 'home_yellow_cards']].sum()
team1_cards.reset_index(inplace=True)
team1_cards.sort_values(by='home_yellow_cards', inplace=True)

team1 = team1_cards.melt('home_team', var_name='cols', value_name='vals')
px.bar(
    team1,
    x=team1['home_team'],
    y=team1['vals'],
    color='cols',
    barmode='group',
    text_auto=True,
    title='number of yellow and red cards',
)

# --- ScatterPlot ---
# not interactive
sum_atte = match_data.groupby(['venue', 'match_no'])[['attendance']].sum().reset_index()
sum_atte = sum_atte.sort_values('attendance', ascending=False)
sum_atte

fig = plt.scatter(x=sum_atte.match_no, y=sum_atte.attendance)
px.scatter(x=sum_atte.match_no, y=sum_atte.attendance, color=sum_atte.venue)

species = ['homeXG', 'awayXG', 'home_goals', 'away_goals']
matrix = pd.plotting.scatter_matrix(match_data[species], figsize=(12, 12))

gr = px.scatter(
    x=match_data.away_attempts,
    y=match_data.away_ontarget,
    title='correlation between #attempts and #balls on target',
)
gr.show()

px.scatter(
    data_frame=mo,
    x=mo.player,
    y=mo.aerials_lost,
    color=mo.position,
    size=mo.aerials_lost,
    facet_row='position',
    title='Number of Aerlis lost by morroc',
)

# Shots per player for the two finalists, one line chart each.
players = player_shooting.groupby(['team', 'player'])[['shots', 'goals']].sum().reset_index()
Ar = players[players.team == 'Argentina']
Fr = players[players.team == 'France']
gu = make_subplots(rows=1, cols=2, horizontal_spacing=0.1, vertical_spacing=0.5)
# go.Line is deprecated; go.Scatter is the supported trace type for line charts.
gu.add_trace(go.Scatter(x=Ar.player, y=Ar.shots, name='Argentina Trace'), row=1, col=1)
gu.add_trace(go.Scatter(x=Fr.player, y=Fr.shots, name='France Trace'), row=1, col=2)

# --- Dual-axis line plot: changes in home_goals and away_goals over time ---
fig, ax = plt.subplots(figsize=(12, 5))
ax2 = ax.twinx()
ax.set_title('Ghanges of goals')
ax.set_xlabel('Year')
ax.plot(match_data['date'], match_data['home_goals'], color='green', marker='x')
ax2.plot(match_data['date'], match_data['away_goals'], color='red', marker='o')
ax.set_ylabel('goals_team1')
ax2.set_ylabel('')
ax.legend(['home_goals'])
ax2.legend(['away_goals'], loc='upper center')
plt.show()

plt.figure(figsize=(8, 8))
fi = sns.lineplot(data=match_data, x='date', y='goals_total')

# Total goals per team = goals as home side + goals as away side.
matche_changes = match_data.groupby(['home_team'])[['home_goals']].sum()
match_changest = match_data.groupby(['away_team'])[['away_goals']].sum()
li = matche_changes.home_goals
li2 = match_changest.away_goals
# fill_value=0 keeps a team that appears on only one side instead of giving NaN.
li3 = li.add(li2, fill_value=0)
# Later cells read the columns 'home_team' and 0, so pin both names explicitly.
team_goals = li3.rename_axis('home_team').rename(0).reset_index()

red_home = match_data.groupby('home_team')[['home_RedCards']].sum().reset_index()
red_away = match_data.groupby('away_team')[['away_RedCards']].sum().reset_index()
away_yellow = match_data.groupby('away_team')[['away_yellow_cards']].sum().reset_index()
home_yellow = match_data.groupby('home_team')[['home_yellow_cards']].sum().reset_index()

# Combine home and away counts by TEAM NAME. After reset_index the frames only
# share a positional index, so adding them directly pairs rows by position and
# is wrong whenever the home and away team lists differ.
yellow_by_team = home_yellow.set_index('home_team')['home_yellow_cards'].add(
    away_yellow.set_index('away_team')['away_yellow_cards'], fill_value=0)
red_by_team = red_home.set_index('home_team')['home_RedCards'].add(
    red_away.set_index('away_team')['away_RedCards'], fill_value=0)
home_yellow['total'] = home_yellow['home_team'].map(yellow_by_team)
home_yellow['red'] = home_yellow['home_team'].map(red_by_team)
home_yellow.drop(['home_yellow_cards'], axis=1, inplace=True)

# Cards received by every team
team_goals

# ---------------------------------------------------------------------------
# Boxplot
# ---------------------------------------------------------------------------
# Distribution of total goals per team; column 0 of team_goals holds the totals.
px.box(team_goals, x=team_goals[0], title='Boxplot for each team goals')

# Goalkeeper saves, one box per team.
px.box(
    data_frame=player_keepers,
    y=player_keepers.gk_saves,
    color=player_keepers.team,
    title='GK_saves',
)

# Card counts from the melted team1 table.
sns.boxplot(team1.vals)

# Home goals vs. away goals.
match_data[['home_goals', 'away_goals']].boxplot()
# Goals per (team, player), highest first, truncated to the ten top scorers.
# Select 'goals' before aggregating so that only that column is summed.
team_palyer = player_stats.groupby(['team', 'player'])[['goals']].sum().reset_index()
team_palyer.sort_values('goals', ascending=False, inplace=True)
team_palyer = team_palyer[:10]

# Visualize the top 10 players by goals scored
# Pie chart of the top scorers: one slice per player, coloured by team.
px.pie(
    data_frame=team_palyer,
    names=team_palyer.player,
    values=team_palyer.goals,
    color=team_palyer.team,
    hole=0.1,
    hover_name='player',
)

# Show the attendance in each venue
# Donut chart of attendance, one slice per venue.
fi = px.pie(
    data_frame=sum_atte,
    values=sum_atte.attendance,
    names=sum_atte.venue,
    color=sum_atte.match_no,
    hole=0.5,
    title='Venue with Total Attendence',
)
fi.show()

# Treemap of total goals per team; column 0 of team_goals holds the totals.
f = px.treemap(
    data_frame=team_goals,
    path=['home_team'],
    values=team_goals[0],
    title='Team Goals',
)
f.show()

# From the plot above we conclude that France is the top-scoring team,
# with 16 goals.
# Treemap of home goals, nested by home team and then opponent.
px.treemap(match_data, path=['home_team', 'away_team'], values=match_data.home_goals)

# --- Heatmap ---
px.density_heatmap(
    match_data,
    x='goals_total',
    title='Total Number of goals',
    text_auto=True,
    marginal_x='histogram',
)
px.density_heatmap(
    team_goals,
    x='home_team',
    title='Total Number of goals',
    text_auto=True,
    marginal_y='box',
)

# Join the general and shooting tables on all of their shared columns, then
# rank players by assists. Select the columns before aggregating so that only
# 'games' and 'assists' are summed.
merged = pd.merge(player_stats, player_shooting)
merg = merged.groupby('player')[['games', 'assists']].sum().reset_index()
top_10 = merg.sort_values('assists', ascending=False).head(10)

# --- Pairplot for home_team and its statistics ---
mat = match_data[['home_team', 'homeXG', 'home_goals', 'home_yellow_cards']]
sns.pairplot(mat)